Gender bias in audience of seminars and career position

Data

EcoEncontros Seminar talks

Talks from the EcoEncontros Seminar series at the Graduate Program in Ecology of the University of São Paulo (PPGE-USP), Brazil.

See file metadata.txt, in folder data for more description and detail of the dataset.

# Load the seminar records (columns are described in metadata.txt, in the
# data folder).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
# Parse the date strings with dmy() and derive the calendar year from them.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)

Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.

# Seminar ids to drop from the analysis.
IDs <- c(154, 250, 211, 289)
data <- filter(data, !(id %in% IDs))

For this specific analysis, we exclude speakers who are not in academia (“others”) and keep three groups: student (undergraduate, MD and PhD students), postdoc, and professor or researcher*.

*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.

# Drop the "others" category, then move student, postdoc and professor to the
# front of the factor levels, in that order.
data <- filter(data, position_cat != "others")
data$position_cat <- fct_relevel(
  data$position_cat,
  "student", "postdoc", "professor"
)

Excluding seminars with more than one speaker

# Ids that occur in more than one row.
events <- filter(count(data, id), n > 1)
# Keep rows whose id is not repeated and whose audience count is present.
data <- data %>%
  filter(!(id %in% events$id)) %>%
  filter(!is.na(audience_n))
dim(data)
## [1] 299  31

Data description

Regarding the differences in the audience, there were 299 talks with the number of attendees, allowing us to conduct the analysis, as we excluded seminars with more than one presenter (round tables, and special seminars).

Audience by gender and academic position

# Audience size by academic position, one box per gender.
ggplot(data, aes(x = position_cat, y = audience_n, fill = gender)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))

  #geom_violin(position = position_dodge(0.8)) +
  #geom_jitter(position=position_jitterdodge(0.2),shape=21)
library(ggbeeswarm)
# Another option: violins with the individual talks overlaid and a horizontal
# bar at each group's median.
# NOTE(review): `fun.y` and the `..y..` notation are deprecated in recent
# ggplot2 releases (replaced by `fun` and `after_stat(y)`); kept unchanged here.
ggplot(data, aes(x = position_cat, y = audience_n, fill = gender)) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  stat_summary(
    fun.y = median, aes(ymin = ..y.., ymax = ..y..), geom = "errorbar",
    width = 0.8, size = 0.8, position = position_dodge(width = 0.9)
  ) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  labs(x = "", y = "Audience (N)")

Variation in time

# Audience through time, one panel per academic position, with a smoothed
# trend added on top of the individual talks.
ggplot(data, aes(x = date, y = audience_n, fill = gender)) +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  geom_smooth() +
  facet_wrap(~position_cat, ncol = 1) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  labs(x = "", y = "Audience (N)")

Looking for possible biases for speakers from inside and outside PPGE.

# Flag speakers whose origin is "IB" as inside PPGE, all others as outside,
# and cross-tabulate the flag against gender.
data[["ppge"]] <- ifelse(data[["origin"]] == "IB", "inside", "outside")
table(data[["gender"]], data[["ppge"]])
##    
##     inside outside
##   F     76      49
##   M     79      95
# Audience for speakers from inside vs outside PPGE, split by gender, with a
# bar at each group's median.
ggplot(data, aes(x = ppge, y = audience_n, fill = gender)) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  stat_summary(
    fun.y = median, aes(ymin = ..y.., ymax = ..y..), geom = "errorbar",
    width = 0.8, size = 0.8, position = position_dodge(width = 0.9)
  ) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  labs(x = "PPGE", y = "Audience (N)")

Looking for possible biases for speakers from Brazil and abroad.

# Flag speakers whose country is "Brasil" and cross-tabulate the flag against
# gender.
data[["brazilian"]] <- ifelse(data[["country"]] == "Brasil", "yes", "no")
table(data[["gender"]], data[["brazilian"]])
##    
##      no yes
##   F  22 103
##   M  50 124
# Audience for Brazilian vs foreign speakers, split by gender, with a bar at
# each group's median.
ggplot(data, aes(x = brazilian, y = audience_n, fill = gender)) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  stat_summary(
    fun.y = median, aes(ymin = ..y.., ymax = ..y..), geom = "errorbar",
    width = 0.8, size = 0.8, position = position_dodge(width = 0.9)
  ) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  labs(x = "Brazilian", y = "Audience (N)")

Modeling

Negative binomial

# Two-level period factor: years before 2018 are "before", 2018 onwards are
# "after"; "before" is made the first (reference) level.
data$affirm_action <- fct_relevel(
  ifelse(data$year < 2018, "before", "after"),
  "before", "after"
)
# Candidate negative binomial models for audience size. Models with a "b"
# suffix use the before/after affirm_action factor where the unsuffixed model
# uses numeric year.

# Intercept-only model and single-predictor models.
mg0 <- glm.nb(audience_n~ 1, data=data)
mg1 <- glm.nb(audience_n~ gender, data=data)
mg2 <- glm.nb(audience_n~ position_cat, data=data)
mg3 <- glm.nb(audience_n~ year, data=data)
mg3b <- glm.nb(audience_n~ affirm_action, data=data)

# Two predictors, additive.
mg4 <- glm.nb(audience_n~ gender + position_cat, data=data)
mg5 <- glm.nb(audience_n~ gender + year, data=data)
mg5b <- glm.nb(audience_n~ gender + affirm_action, data=data)
mg6 <- glm.nb(audience_n~ year + position_cat, data=data)
mg6b <- glm.nb(audience_n~ affirm_action + position_cat, data=data)

# Two predictors with their interaction.
mg7 <- glm.nb(audience_n~ gender*position_cat, data=data)
mg8 <- glm.nb(audience_n~ gender*year, data=data)
mg8b <- glm.nb(audience_n~ gender*affirm_action, data=data)
mg9 <- glm.nb(audience_n~ year*position_cat, data=data)
mg9b <- glm.nb(audience_n~ affirm_action*position_cat, data=data)

# Three predictors: fully additive, or with one pairwise interaction.
mg10 <- glm.nb(audience_n~ gender + position_cat + year, data=data)
mg10b <- glm.nb(audience_n~ gender + position_cat + affirm_action, data=data)
mg11 <- glm.nb(audience_n~ gender*position_cat + year, data=data)
mg11b <- glm.nb(audience_n~ gender*position_cat + affirm_action, data=data)
mg12 <- glm.nb(audience_n~ gender + position_cat * year, data=data)
mg12b <- glm.nb(audience_n~ gender + position_cat * affirm_action, data=data)
mg13 <- glm.nb(audience_n~ gender*year + position_cat, data=data)
mg13b <- glm.nb(audience_n~ gender*affirm_action + position_cat, data=data)


# Three predictors with all interactions, including the three-way term.
mg14 <- glm.nb(audience_n~ gender*position_cat*year, data=data)
mg14b <- glm.nb(audience_n~ gender*position_cat*affirm_action, data=data)

# Rank every candidate model by AIC and print the table with two decimals.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
AICtab(mg2, mg0, mg1, mg3, mg4, mg5, mg6, mg7, mg8, mg9, mg10, mg11, mg12,
       mg13, mg14,
       mg3b, mg5b, mg6b, mg8b, mg9b, mg10b, mg11b, mg12b, mg13b, mg14b,
       base = TRUE, weights = TRUE) %>%
  kable(digits = 2)
AIC dAIC df weight
mg11b 2168.07 0.00 8 0.48
mg10b 2169.26 1.19 6 0.26
mg13b 2171.26 3.19 7 0.10
mg12b 2172.47 4.41 8 0.05
mg11 2173.77 5.70 8 0.03
mg10 2174.21 6.14 6 0.02
mg6b 2174.87 6.81 5 0.02
mg14b 2175.67 7.60 13 0.01
mg7 2175.79 7.72 7 0.01
mg13 2176.19 8.12 7 0.01
mg4 2176.31 8.24 5 0.01
mg12 2177.82 9.75 8 0.00
mg9b 2178.54 10.47 7 0.00
mg6 2179.00 10.93 5 0.00
mg2 2181.06 13.00 4 0.00
mg14 2182.06 14.00 13 0.00
mg9 2182.24 14.17 7 0.00
mg5b 2202.12 34.05 4 0.00
mg8b 2203.83 35.76 5 0.00
mg5 2205.11 37.05 4 0.00
mg1 2206.19 38.13 3 0.00
mg8 2207.09 39.03 5 0.00
mg3b 2219.15 51.08 3 0.00
mg3 2220.65 52.59 3 0.00
mg0 2221.46 53.40 2 0.00

Residual diagnostic

# Residual checks for the top-ranked model in the AIC table (mg11b): an hnp
# plot followed by a plot of the simulated residuals.
hnp::hnp(mg11b)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg11b))

# Same two checks for the second-ranked model (mg10b).
hnp::hnp(mg10b)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg10b))

Models result

The two equally plausible models for the audience included gender, academic position and affirmative actions as predictors; they differ in that the best-fitting model also includes an interaction between gender and academic position (R2 = 0.3).

# Coefficient table and fit statistics for the model with the gender x
# position interaction plus the affirmative-action period.
summary(mg11b)
## 
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat + affirm_action, 
##     data = data, init.theta = 6.652451909, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8934  -0.7775  -0.1330   0.4896   3.8037  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    2.80898    0.05405  51.973  < 2e-16 ***
## genderM                        0.11625    0.07453   1.560  0.11882    
## position_catpostdoc            0.11708    0.10696   1.095  0.27372    
## position_catprofessor          0.20059    0.10432   1.923  0.05449 .  
## affirm_actionafter             0.19567    0.06316   3.098  0.00195 ** 
## genderM:position_catpostdoc   -0.12030    0.14428  -0.834  0.40440    
## genderM:position_catprofessor  0.22897    0.12861   1.780  0.07503 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.6525) family taken to be 1)
## 
##     Null deviance: 376.12  on 298  degrees of freedom
## Residual deviance: 303.02  on 292  degrees of freedom
## AIC: 2168.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.652 
##           Std. Err.:  0.708 
## 
##  2 x log-likelihood:  -2152.067
# R2 for mg11b as reported by performance::r2().
performance::r2(mg11b)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.303

Average audience before affirmative actions: 16.5929329

Average audience after affirmative actions: 20.179078

# Model predictions for each combination of position, gender and period.
myg11b <- ggpredict(
  mg11b,
  terms = c("position_cat", "gender", "affirm_action")
)
# Print them with the predicted audience rounded to whole numbers.
mutate(as.data.frame(myg11b), predicted = round(predicted, digits = 0))
##            x predicted  std.error conf.low conf.high group  facet
## 1    student        17 0.05404728 14.92513  18.44711     F before
## 2    student        20 0.06992406 17.59470  23.14306     F  after
## 3    student        19 0.05736112 16.65651  20.85622     M before
## 4    student        23 0.06847745 19.81983  25.92245     M  after
## 5    postdoc        19 0.09474248 15.49256  22.46018     F before
## 6    postdoc        23 0.10566643 18.44179  27.90550     F  after
## 7    postdoc        19 0.07992838 15.88458  21.72931     M before
## 8    postdoc        23 0.09796456 18.64668  27.37642     M  after
## 9  professor        20 0.09257254 16.91385  24.31298     F before
## 10 professor        25 0.10078386 20.24096  30.04732     F  after
## 11 professor        29 0.05126210 25.90194  31.66658     M before
## 12 professor        35 0.07476636 30.08179  40.32609     M  after

Complete figure

# Prediction table with its columns renamed to match the variables in `data`,
# so both layers below can share the same axis and facet variables.
prs <- as.data.frame(myg11b) %>%
  rename(position_cat = x, affirm_action = facet)

# Observed audiences (faded points) with the model predictions and their
# intervals on top, one panel per affirmative-action period.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
ggplot(data, aes(x = position_cat, y = audience_n)) +
  geom_point(aes(col = gender), position = position_dodge(0.6), alpha = 0.3,
             size = 3, show.legend = FALSE) +
  facet_grid(~affirm_action,
             labeller = as_labeller(c("before" = "Before affirmative actions",
                                      "after" = "After affirmative actions"))) +
  #scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  geom_pointrange(data = prs,
                  aes(x = position_cat, y = predicted, fill = group,
                      ymax = conf.high, ymin = conf.low),
                  alpha = 1, position = position_dodge(0.6), size = 1,
                  shape = 21, col = "black") +
  xlab("Academic position") + ylab("Audience (N)")

ggsave("figures/audience_speakers.jpeg", width = 8, height = 4)
# Portuguese-language version of the audience figure: period on the x axis,
# one panel per academic position.
# The accented labels are written as proper UTF-8 text so they render
# correctly on the saved figure.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
ggplot(data, aes(x = affirm_action, y = audience_n)) +
  geom_point(aes(col = gender), position = position_dodge(0.6), alpha = 0.3,
             size = 3, show.legend = FALSE) +
  facet_grid(~position_cat, labeller =
               as_labeller(c(student = "Pós-graduande",
                             postdoc = "Pós-doc",
                             professor = "Docente"))) +
  scale_color_manual(values = c("#6D57CF", "#FCA532")) +
  scale_x_discrete(labels = c("Antes", "Depois")) +
  scale_fill_manual(name = "Gênero", values = c("#6D57CF", "#FCA532")) +
  geom_pointrange(data = prs,
                  aes(x = affirm_action, y = predicted, fill = group,
                      ymax = conf.high, ymin = conf.low),
                  alpha = 1, position = position_dodge(0.6), size = 1,
                  shape = 21, col = "black") +
  xlab("Ações afirmativas") + ylab("Audiência (N)")

ggsave("figures/audience_speakers_b_port.jpeg", width = 8, height = 4)
# Coefficient table and fit statistics for the additive model (no gender x
# position interaction).
summary(mg10b)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat + affirm_action, 
##     data = data, init.theta = 6.502702402, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8101  -0.7831  -0.1381   0.4704   3.9188  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.79293    0.04972  56.177  < 2e-16 ***
## genderM                0.15356    0.05463   2.811  0.00494 ** 
## position_catpostdoc    0.04418    0.07250   0.609  0.54229    
## position_catprofessor  0.36571    0.06046   6.049 1.46e-09 ***
## affirm_actionafter     0.18918    0.06316   2.995  0.00274 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.5027) family taken to be 1)
## 
##     Null deviance: 369.75  on 298  degrees of freedom
## Residual deviance: 303.07  on 294  degrees of freedom
## AIC: 2169.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.503 
##           Std. Err.:  0.688 
## 
##  2 x log-likelihood:  -2157.260
# R2 for mg10b as reported by performance::r2().
performance::r2(mg10b)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.282
# Predictions from the additive model, drawn with the object's plot method and
# the two-colour gender palette.
myg10b <- ggpredict(
  mg10b,
  terms = c("position_cat", "gender", "affirm_action")
)
plot(myg10b) + scale_color_manual(values = c("#b2abd2", "#fdb863"))

Only professors - productivity metrics

Investigating whether differences in productivity between male and female professors and researchers are related to the audience.

Measured productivity publication metrics from Google Scholar for professors and researchers.

Creating productivity index using PCA 1st axis from metrics.

PCA productivity metrics

# Keep only the talks that have both productivity metrics recorded.
# Columns are referenced by bare name so filter() evaluates them on the data
# frame being piped in, rather than reaching back to the global `data` object.
dp <- data %>%
  filter(!is.na(total_citation_n), !is.na(nature_index_count))
# Number of talks per gender and affirmative-action period in this subset.
table(dp$gender, dp$affirm_action)
##    
##     before after
##   F     14     6
##   M     58     9

Productivity publication metrics

# PCA on the productivity metrics.
# NOTE(review): the metrics are selected by column position (22 to 29), so this
# depends on the column order of `dp` - confirm if columns are added or moved.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
pca1 <- PCA(dp[, 22:29], graph = FALSE)

# Biplot with individuals coloured by gender and dashed reference lines through
# the origin. The title typo and the unbalanced parenthesis in the PC2 axis
# label are corrected.
p1 <- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses = TRUE,
                      col.ind.sub = "none", geom = "point",
                      repel = TRUE) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_color_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  scale_shape(name = "gender") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  ggtitle("PCA biplot for professors' productivity metrics") +
  xlab("PC1 (52%)") + ylab("PC2 (21%)") +
  theme_cowplot()

p1

# Pass the plot explicitly instead of relying on the last plot displayed.
ggsave("figures/pca_biplot.jpeg", plot = p1, width = 8, height = 8)

For the analysis specific to professor talks (N=87), the PCA results show that all the productivity metrics for professors were highly correlated (Figure 2B) with the first axis (52% of variance explained), while the institution indexes composed the second PCA axis (21% of variance explained).

Extracting PCA 2 first axes

# Store each talk's coordinates on the first two PCA axes as new columns.
dp[["pc1"]] <- pca1[["ind"]][["coord"]][, 1]
dp[["pc2"]] <- pca1[["ind"]][["coord"]][, 2]

Modeling

We used generalized linear models with negative binomial distribution, given the large variation in the audience (range between 4 and 101).

OBS interna (apagar depois): podem perguntar por que a gente não colocou o affirm_action no modelo, já que é importante para prever a audiência (como vimos no modelo anterior). Eu até fiz uns testes, mas eu não acho que devamos complicar este modelo, já que o foco aqui é comparar professores quanto à produtividade, com hipótese clara de que, dada uma mesma produtividade, ainda assim as mulheres vão ter menor audiência que homens. Então é bom deixar isso claro no texto - de que estamos nessa análise focando apenas em gênero e métrica de produtividade e por isso não tem tempo nesses modelos. Eu também preferi colocar pc1 e pc2 sempre juntos como variável de produtividade - sem fazer modelos separados.

# Candidate negative binomial models for the professor subset. pc1 and pc2
# always enter together as the productivity predictors.
m0 <- glm.nb(audience_n ~ 1, data = dp)
m1 <- glm.nb(audience_n ~ gender, data = dp)
m2 <- glm.nb(audience_n ~ pc1 + pc2, data = dp)

# Gender and productivity, additive.
m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data = dp)

# Gender interacting with each productivity axis.
m4 <- glm.nb(audience_n ~ gender*pc1 + gender*pc2, data = dp)

# Rank the models by AIC and print the table with two decimals.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
AICtab(m0, m1, m2, m3, m4,
       base = TRUE, weights = TRUE) %>%
  kable(digits = 2)
AIC dAIC df weight
m3 693.36 0.00 5 0.55
m2 695.53 2.17 4 0.19
m4 695.74 2.38 7 0.17
m1 697.16 3.80 3 0.08
m0 700.73 7.37 2 0.01

Residual diagnostic

Best model

# Residual checks for the top-ranked professor model (m3): an hnp plot
# followed by a plot of the simulated residuals.
hnp(m3)
## Negative binomial model (using MASS package)

plot(simulateResiduals(m3))

Model results

# Coefficient table and fit statistics for the gender + pc1 + pc2 model.
summary(m3)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp, 
##     init.theta = 5.219733971, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5233  -0.7225  -0.2033   0.4610   3.3874  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.11594    0.11007  28.310   <2e-16 ***
## genderM      0.26225    0.12520   2.095   0.0362 *  
## pc1          0.07478    0.02511   2.978   0.0029 ** 
## pc2         -0.01698    0.03941  -0.431   0.6665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.2197) family taken to be 1)
## 
##     Null deviance: 103.589  on 86  degrees of freedom
## Residual deviance:  89.206  on 83  degrees of freedom
## AIC: 693.36
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.220 
##           Std. Err.:  0.927 
## 
##  2 x log-likelihood:  -683.359
# R2 for m3 as reported by performance::r2().
performance::r2(m3)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.219

We used the first and second PCA axis as predictors together with gender to explain the professor’s audience, and found that, as expected, audience increases with productivity index (first PCA axis) but female professors still presented average audience 1.3 times smaller than male professors (R2 of the best fitting model = 0.218941).

# Predicted audience along the first PCA axis for each gender, as a plain data
# frame for plotting.
my3 <- as.data.frame(ggpredict(m3, terms = c("pc1", "gender")))

# Prediction lines with their interval ribbons, and the observed professor
# talks drawn on top.
ggplot(my3, aes(x = x, y = predicted, col = group)) +
  geom_ribbon(
    aes(ymin = conf.low, ymax = conf.high, fill = group),
    alpha = 0.3, colour = NA
  ) +
  geom_line() +
  geom_point(
    data = dp,
    aes(x = pc1, y = audience_n, col = gender),
    alpha = 0.6
  ) +
  scale_color_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  theme_cowplot() +
  ggtitle("") +
  labs(x = "Productivity index (PC1 axis)", y = "Audience (N)")

ggsave("figures/audience_professor.jpeg", width = 9, height = 6)

PC 2 - não importante, não “significativo”:

# Predicted audience along the second PCA axis for each gender.
my3 <- ggpredict(m3, terms = c("pc2", "gender"))
plot(my3) +
  theme_cowplot() +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))

Figure audience

# Prediction table with its columns renamed to match the variables in `data`.
prs <- as.data.frame(myg11b) %>%
  rename(affirm_action = facet, position_cat = x)

# Panel A: observed audiences (faded points) and model predictions with their
# intervals, before vs after affirmative actions, one panel per position.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
f1 <- ggplot(data, aes(x = affirm_action, y = audience_n)) +
  geom_point(aes(col = gender), position = position_dodge(0.6), alpha = 0.3,
             size = 3, show.legend = FALSE) +
  facet_grid(~position_cat) +
  #scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  geom_pointrange(data = prs,
                  aes(x = affirm_action, y = predicted, fill = group,
                      ymax = conf.high, ymin = conf.low),
                  alpha = 1, position = position_dodge(0.6), size = 1,
                  shape = 21, col = "black") +
  ylab("Audience (N)") +
  xlab("Affirmative actions") +
  labs(tag = "A")
  


# Predictions along the first PCA axis, plus a constant column that is used
# only to give the panel a facet strip title.
my3 <- as.data.frame(ggpredict(m3, terms = c("pc1", "gender")))
my3[["prof"]] <- "Professors only"

# Panel C: prediction lines with interval ribbons and the observed professor
# talks on top; the legend is hidden on this panel.
f2 <- ggplot(my3, aes(x = x, y = predicted, col = group)) +
  geom_ribbon(
    aes(ymin = conf.low, ymax = conf.high, fill = group),
    alpha = 0.3, colour = NA
  ) +
  geom_line(size = 1.5) +
  geom_point(
    data = dp,
    aes(x = pc1, y = audience_n, col = gender),
    alpha = 0.6, size = 2
  ) +
  facet_grid(~prof) +
  scale_color_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  theme_cowplot() +
  ggtitle("") +
  theme(legend.position = "none") +
  labs(x = "Productivity index (PC1 axis)", y = "Audience (N)", tag = "C")


# NOTE(review): f3 is not referenced again in the visible part of the file.
f3<- plot_spacer()

# Layout passed to plot_layout(): area 1 spans the whole top row, area 2 sits
# in the two middle cells of the bottom row, and '#' marks empty cells.
design <- "
           1111
           #22#
"

# Combine the two panels under that layout, collecting the guides.
f1 + f2  + 
  plot_layout( design=design, guides="collect")

# No plot is passed, so ggsave() writes the most recently displayed one.
ggsave("figures/FIG_audience.jpeg", width=9, height = 8)  
# Panel B: PCA biplot of the productivity metrics, restyled to sit next to the
# other panels (no legend, small left-aligned title).
p1 <- fviz_pca_biplot(
  pca1,
  col.ind = dp$gender, addEllipses = TRUE,
  col.ind.sub = "none", geom = "point",
  repel = TRUE
) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_grid(. ~ .) +
  scale_color_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  scale_shape(name = "gender") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  labs(
    title = "PCA professors' productivity", tag = "B",
    x = "PC1 (52%)", y = "PC2 (21%)"
  ) +
  theme_cowplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 12, vjust = -5, hjust = 0)
  ) #+
  #coord_cartesian(clip = "off")+
  # scale_x_continuous(limits=c(-6,8), expand=c(0,0))+
  # scale_y_continuous(limits=c(-4,8), breaks=c(-4,-2,0,2,4,6))
  # annotate("rect", xmin=-6, xmax=8, ymin=7,ymax=8, fill="gray85")+
  #annotate("text",label="Professors'productivity", x=0, y=7.5, fill="gray85")



# Panel A on top, panels B and C side by side below, guides collected.
(f1 / (p1 + f2)) + plot_layout(guides = "collect")

ggsave("figures/FIG_audience_test.jpeg", width = 9, height = 8)